library(stringr)
library(magrittr)
library(stm)
library(tm)
library(tmcn)
library(lda)
library(jiebaR)
library(topicmodels)
library(SnowballC)
library(ggplot2)
library(reshape2)
library(dplyr)
library(tidyr)
library(Matrix)
library(data.table)
library(scales)

##read in filenames =
setwd("~")
load("All.Rdata")

get.Dict = function(f){
  ## Get stopword dictionary with information entropy algorithm
  seg = worker(type = "mix", stop_word = "~/stopword.txt", encoding = "UTF-8")
  text = lapply(segged, function(x) segment(x, seg))
  text = lapply(text, function(x) paste(x, collapse = " "))
  lexicalized = lexicalize(text, sep = " ") ## lexicalise for the first time, erase words with one character and showed only once
  dic = lexicalized$vocab[word.counts(lexicalized$documents, lexicalized$vocab) >= 2] #erase the words only showed up once
  dic = dic[which(nchar(dic) != 1)] #erase the words with only one character
  documents <- lexicalize(text, lower = TRUE, vocab = dic) #lexicalize again with cleaner dictionary to calculate entropy
  
  ## IF NOT ENTROPY, KEEP = DIC DIRECTLY!!!
  keep = dic[-which(str_detect(dic, pattern = "[:digit:]"))]
  
  ## Entropy
  wc.doc = summary(documents)
  wc.doc = as.matrix(wc.doc)
  wc.doc = as.numeric(wc.doc[,1]) # the number of words in each document
  
  word.by.doc = lapply(documents, function(x) word.counts(x, vocab = dic)) 
  # a very sparse matrix with all the words in there. Each document is expressed as a vector of 0, 1 to infinitive.
  # this line may drag slow the entire programme
  
  word.list = list()
  for (i in 1:length(word.by.doc)){
    w = lapply(word.by.doc[i], function(y) (y/wc.doc[i]) * log((wc.doc[i])/y))
    word.list = append(word.list, c(w))
  }
  
  for (i in 1:length(word.list)){
    word.list[[i]][is.na(word.list[[i]])] = 0
  }
  # counted something for each of the document
  matrix = do.call(cbind, word.list[1:length(word.list)]) %>% as.matrix
  # A matrix with (number of document) columns and (number of words) rows
  entropy = rowSums(matrix) %>% sort()
  entropy = entropy[-which(str_detect(names(entropy), pattern = "[:digit:]"))] ## Remove numbers from the dictionary
  # A skimmed dictionary got: level of cleanness adjustable
  keep = names(entropy[which((entropy >= quantile(entropy, f)) & (entropy <= quantile(entropy, (1-f))))])
  return(keep)
}
keep = get.Dict(0)

##segged is a list of text body
##info is a data frame of meta information: both for stm and for DublinCore
##Clear the infomation session in the body
##Tag
corpus = VCorpus(VectorSource(segged))
for (i in 1:length(corpus)){
  DublinCore(corpus[[i]], tag = "date") = info$date[i]
  DublinCore(corpus[[i]], tag = "creator") = info$creator[i]
  DublinCore(corpus[[i]], tag = "identifier") = info$id[i]
  DublinCore(corpus[[i]], tag = "title") = info$title[i]
  DublinCore(corpus[[i]], tag = "type") = info$level[i]
}

corpus.word.count = lapply(lapply(corpus$content, nchar), function(x) x = x[-2]) %>% unlist #calculate word count for each word
corpus = corpus[-which(corpus.word.count < quantile(corpus.word.count, .05) | 
                         corpus.word.count > quantile(corpus.word.count, .95))] # trim the top and bottom 5% of the documents in terms of word counts

##Segment
seg = worker(type = "mix")
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], seg)[which(segment(d[1][[1]],seg) %in% keep)])
}
control.list = list(wordLengths = c(2,Inf), weighting = "weightTfIdf", tokenize = jieba_tokenizer)    ##only words with more than two characters
dtm = DocumentTermMatrix(corpus, control = control.list)
rownames(dtm) = unlist(DublinCore(corpus)$title)

# Get rid of empty rows in the dtm and its associated corpus
dtm = removeSparseTerms(dtm, 0.9994)

# Remove documents without words
di = dtm$i %>% unique
kept = 1:length(corpus) %in% di
corpus = corpus[kept]
dtm = dtm[di,]
shortdocs = lapply(corpus, function(x) x$content)
rm(di)
rm(kept)

# Convert to STM format data
processed = readCorpus(dtm, 'slam')

processed$meta = data.frame(identifier = DublinCore(corpus, tag = "identifier") %>% unlist,
                            date = DublinCore(corpus, tag = "date") %>% unlist %>% as.Date,
                            creator = DublinCore(corpus, tag = "creator") %>% unlist,
                            title = DublinCore(corpus, tag = "title") %>% unlist,
                            type = DublinCore(corpus, tag = "type") %>% unlist
)

processed$meta$days = as.numeric(processed$meta$date - as.Date("2008-01-01"))

out = prepDocuments(processed$documents, processed$vocab, processed$meta)
###

#plot document frequency by date 
par(mfrow= c(1,1))
freq = table(cut(out$meta$date, breaks = 'week'))
freq = as.data.frame(freq)
names(freq) = c('time', 'freq')
freq$time = freq$time %>% as.Date()
findPreviousSunday = function(day){
  day = as.Date(day)
  prev.days = seq(day-6, day, by = 'day')
  Sunday = prev.days[weekdays(prev.days) == 'Sunday']
  return(Sunday)
}
ggplot(freq, aes(x = time, y = freq)) + geom_bar(stat = 'identity', width = 7) + 
  xlab('Week') + ylab('Num. of Articles') +
  #  geom_vline(xintercept = findPreviousSunday('2010-01-16'), colour = '#BB0000') + 
  geom_vline(xintercept = findPreviousSunday('2013-11-18'), colour = '#BB0000') + 
  geom_vline(xintercept = findPreviousSunday('2015-10-29'), colour = '#BB0000') 
rm(freq)




### STM
M = stm(documents = out$documents,
        vocab = out$vocab,
        K = 48,
        prevalence = ~s(days) + type,
        max.em.its = 10000,
        data = out$meta,
        init.type = 'Spectral',
        seed = 2123)
labelTopics(M, n = 20)
co_days = estimateEffect(1:48 ~ s(days) + type, M, meta = out$meta, uncertainty = 'Global')
plot(M, type = 'summary', xlim = c(0, .1))
topic_prop = (M$theta %>% colSums) / (M$theta %>% colSums %>% sum)

inx = c(39,40,38,15,42,7,6,44,16,24,29,11,37,4,10,22,18,25,46,14,2,12,17,36,5,21,33,35,8,26,45,32,23,48,20,30,28,1,13,34,43,41,31,9,19,47,3,27)
topic_prop = topic_prop[inx] * 100

# Plot results / by day
plot_topics = function(i){
  plot(co_days, 
       'days', 
       method = 'continuous', 
       topics = inx[i], 
       printlegend = F, 
       xaxt = 'n',
       yaxt = 'n',
       ylim = c(0, 0.1),
       xlab = NA,
       ylab = NA)
  abline(v = as.Date('2010-01-16') - as.Date('2008-01-01'), col = 'blue')
  abline(v = as.Date('2013-11-12') - as.Date('2008-01-01'), col = 'blue')
  abline(v = as.Date('2015-12-27') - as.Date('2008-01-01'), col = 'blue')
  ## Without Labels 
  # monthseq = seq(from = as.Date('2008-01-01'), to = as.Date('2017-12-31'), by = 'month')
  # monthnames = substr(monthseq, 1, 7)
  # axis(1, at = as.numeric(monthseq) - min(as.numeric(monthseq)), labels = monthnames)
}
par(mfrow = c(3,4))
for (i in 1:12) {plot_topics(i)}
for (i in 13:24) {plot_topics(i)}
for (i in 25:36) {plot_topics(i)}
for (i in 37:48) {plot_topics(i)}

#plot results / by type
par(mfrow=c(1,1))
plot(co_days, covariate = 'type', topics = inx[1:48], model = M, method = 'difference', cov.value1 = 'provincial', cov.value2 = 'central',
     xlab = 'Central ... Provincial', main = 'Effect of Central vs. Provincial', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:48)))
plot(co_days, covariate = 'type', topics = inx[1:48], model = M, method = 'difference', cov.value1 = 'municipal', cov.value2 = 'provincial',
     xlab = 'Provincial ... Municipal', main = 'Effect of Provincial vs. Municipal', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:48)))
plot(co_days, covariate = 'type', topics = inx[1:48], model = M, method = 'difference', cov.value1 = 'municipal', cov.value2 = 'central',
     xlab = 'Central ... Municipal', main = 'Effect of Central vs. Municipal', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:48)))
plot(co_days, covariate = 'type', topics = inx[1:48], model = M, method = 'difference', cov.value1 = 'rural', cov.value2 = 'provincial',
     xlab = 'Urban ... Rural', main = 'Effect of Urban vs. Rural', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:48)))


#For each topic
i = 1 # i to be set
findThoughts(M, texts = shortdocs, n = 30, topics = inx[i])$docs[[1]]
plot_topics(i)


#Aggregated
heat_data = matrix(nrow = 48, ncol = 100)
for (i in 1:48){
  tmp = plot(co_days, 'days', method = 'continuous', topics = inx[i])
  heat_data[i,] = c(tmp$means[[1]])
}
colnames(heat_data) = c(plot(co_days, 'days', method = 'continuous', topics = 1)$x)
rownames(heat_data) = 1:48 %>% as.character()
#plot
time.axis = seq.Date(from = as.Date('2008-01-01'), to = as.Date('2017-12-31'), by = '1 day')
index = heat_data %>% colnames %>% as.numeric %>% round
index[1] = 1
index = time.axis[index]
#plot
par(mfrow = c(1,1))
which(index == '2013-11-09')  #59
which(index == '2015-12-23')  #80
LP1 = heat_data[1:5,] %>% colSums
plot(LP1, type = 'l', xaxt = 'n', ylim = c(0, 0.3), xlab = 'Time', ylab = 'Discourse Prevalence')
axis(1, at = c(1,50,100), labels = c(index[1], index[50], index[100]))
abline(v = 59, col = 'blue')
abline(v = 80, col = 'blue')

TC1 = heat_data[6:8,] %>% colSums
plot(TC1, type = 'l', xaxt = 'n', ylim = c(0, 0.3), xlab = 'Time', ylab = 'Discourse Prevalence')
axis(1, at = c(1,50,100), labels = c(index[1], index[50], index[100]))
abline(v = 59, col = 'blue')
abline(v = 80, col = 'blue')

PSA1 = heat_data[9:13,] %>% colSums
plot(PSA1, type = 'l', xaxt = 'n', ylim = c(0, 0.3), xlab = 'Time', ylab = 'Discourse Prevalence')
axis(1, at = c(1,50,100), labels = c(index[1], index[50], index[100]))
abline(v = 59, col = 'blue')
abline(v = 80, col = 'blue')

PSA2 = heat_data[14:20,] %>% colSums
plot(PSA2, type = 'l', xaxt = 'n', ylim = c(0, 0.3), xlab = 'Time', ylab = 'Discourse Prevalence')
axis(1, at = c(1,50,100), labels = c(index[1], index[50], index[100]))
abline(v = 59, col = 'blue')
abline(v = 80, col = 'blue')

SC2 = heat_data[21:26,] %>% colSums
plot(SC2, type = 'l', xaxt = 'n', ylim = c(0, 0.3), xlab = 'Time', ylab = 'Discourse Prevalence')
axis(1, at = c(1,50,100), labels = c(index[1], index[50], index[100]))
abline(v = 59, col = 'blue')
abline(v = 80, col = 'blue')

dustbin = heat_data[c(27:48),] %>% colSums
plot(dustbin, type = 'l', xaxt = 'n', ylim = c(0, 0.5), xlab = 'Time', ylab = 'Discourse Prevalence')
axis(1, at = c(1,50,100), labels = c(index[1], index[50], index[100]))
abline(v = 59, col = 'blue')
abline(v = 80, col = 'blue')

#plot comparison with aggregated
#Levels for one
par(mfrow = c(1,3))
plot(co_days, covariate = 'type', topics = inx[1:13], model = M, method = 'difference', cov.value1 = 'provincial', cov.value2 = 'central',
     xlab = 'Central ... Provincial', main = 'Effect of Central vs. Provincial', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:13)))
plot(co_days, covariate = 'type', topics = inx[1:13], model = M, method = 'difference', cov.value1 = 'municipal', cov.value2 = 'provincial',
     xlab = 'Provincial ... Municipal', main = 'Effect of Provincial vs. Municipal', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:13)))
plot(co_days, covariate = 'type', topics = inx[1:13], model = M, method = 'difference', cov.value1 = 'municipal', cov.value2 = 'central',
     xlab = 'Central ... Municipal', main = 'Effect of Central vs. Municipal', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:13)))

#Levels for two
plot(co_days, covariate = 'type', topics = inx[14:26], model = M, method = 'difference', cov.value1 = 'provincial', cov.value2 = 'central',
     xlab = 'Central ... Provincial', main = 'Effect of Central vs. Provincial', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 14:26)))
plot(co_days, covariate = 'type', topics = inx[14:26], model = M, method = 'difference', cov.value1 = 'municipal', cov.value2 = 'provincial',
     xlab = 'Provincial ... Municipal', main = 'Effect of Provincial vs. Municipal', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 14:26)))
plot(co_days, covariate = 'type', topics = inx[14:26], model = M, method = 'difference', cov.value1 = 'municipal', cov.value2 = 'central',
     xlab = 'Central ... Municipal', main = 'Effect of Central vs. Municipal', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 14:26)))


#urban vs. rural
plot(co_days, covariate = 'type', topics = inx[1:13], model = M, method = 'difference', cov.value1 = 'rural', cov.value2 = 'provincial',
     xlab = 'Urban ... Rural', main = 'Effect of Urban vs. Rural (Pro-One-Child)', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 1:13)))
plot(co_days, covariate = 'type', topics = inx[14:26], model = M, method = 'difference', cov.value1 = 'rural', cov.value2 = 'provincial',
     xlab = 'Urban ... Rural', main = 'Effect of Urban vs. Rural (Pro-Two-Child)', xlim = c(-.1, .1), labeltype = 'custom', custom.labels = c(paste('topic', 14:26)))


### Curve to Coordinate system
df = cbind(LP1, TC1, PSA1) %>% as.data.frame()
df$SC1 = 0
df$x = (df$LP1 + df$SC1 - df$TC1 - df$PSA1) * 100 #-9.2
df$y = (df$LP1 + df$TC1 - df$PSA1 - df$SC1) * 100 #8.8

df = cbind(PSA2, SC2) %>% as.data.frame()
df$LP2 = 0
df$TC2 = 0
df$x = (df$LP2 + df$SC2 - df$TC2 - df$PSA2) * 100 #-1.45
df$y = (df$LP2 + df$TC2 - df$PSA2 - df$SC2) * 100 #-21.23